##=================================================##
## Title: Validating Vote-by-Mail Ballots:         ##
## The Subjective Nature of Signature Verification ##
## Author: Enrijeta Shino and Daniel A. Smith      ##
## Journal: Political Research Quarterly           ##
##=================================================##

rm(list = ls())

# Load libraries
library(data.table)
library(sandwich)
library(lmtest)
library(ggplot2)
library(ggeffects)
library(tidyr)
library(dplyr)
library(texreg)
library(margins)
library(scales)
library(fastDummies)


# Load data
vf_survey <- load("/Users/Enrijeta/Dropbox/ValidatingVotebyMail/data/survey/VF_Survey_OriginalVars.RData")
vf_survey <- original_vars
rm(original_vars)

# Convert to data.table
vf_survey <- data.table(vf_survey)

# Keep only those who completed the survey
vf_survey <- vf_survey[Finished == TRUE]


##==================##
## Recode variables ##
##==================##

# DV: Signature match: 1 = No match (correct); 0 = Yes match (incorrect)
vf_survey$signature_match <- with(vf_survey, ifelse(SIGNATURE_MATCH == "No", 1,
                                             ifelse(SIGNATURE_MATCH == "Yes", 0, NA)))

# Vote choice: 1 = Trump; 0 = Biden/Other
vf_survey$vote_choice <- with(vf_survey, ifelse(VOTE20.1 == "Donald J. Trump (R)", 1,
                                         ifelse(VOTE20.1 == "Joe Biden (D)" | VOTE20.1 == "Other", 0, NA)))

# Party registration (voter file): 1 = Dem; 2 = Rep; 3 = NPA/Other
vf_survey$party_reg <- with(vf_survey, ifelse(PartyAffiliation == "DEM", 1,
                                       ifelse(PartyAffiliation == "REP", 2,
                                       ifelse(PartyAffiliation == "NPA" | PartyAffiliation == "Other", 3, NA))))
vf_survey$party_reg <- factor(vf_survey$party_reg)

# Ideology: 1 = Very Liberal ---> 7 = Very Conservative
vf_survey$ideo7 <- with(vf_survey, ifelse(IDEOLOGY == "Very Liberal", 1,
                                   ifelse(IDEOLOGY == "Slightly Liberal", 2,
                                   ifelse(IDEOLOGY == "Liberal", 3,
                                   ifelse(IDEOLOGY == "Moderate", 4,
                                   ifelse(IDEOLOGY == "Slightly Conservative", 5,
                                   ifelse(IDEOLOGY == "Conservative", 6,
                                   ifelse(IDEOLOGY == "Very Conservative", 7, NA))))))))

# Race (voter file): 1 = White; 0 = Other
vf_survey$white <- with(vf_survey, ifelse(Race == "White", 1, 0))

# Age categories (voter file): 1 = 18-24; 2 = 25-34; 3 = 35-44; 4 = 45-54; 5 = 55-64; 6 = 65+
vf_survey$age_cat <- with(vf_survey, ifelse(Age >= 18 & Age < 25, 1,
                                     ifelse(Age >= 25 & Age < 35, 2,
                                     ifelse(Age >= 35 & Age < 45, 3,
                                     ifelse(Age >= 45 & Age < 55, 4,
                                     ifelse(Age >= 55 & Age < 65, 5,
                                     ifelse(Age >= 65, 6, NA)))))))
vf_survey$age_cat <- factor(vf_survey$age_cat)

# Gender (voter file): 1 = Female; 0 = Male or unknown
vf_survey$female <- with(vf_survey, ifelse(Gender == "F", 1, 0))

# Education: 1 = HS or less; 2 = Some college; 3 = College degree; 4 = Graduate degree
vf_survey$education <- with(vf_survey, ifelse(EDUC == "Less than a high school degree" |
                                              EDUC == "High school graduate or equivalent (ex. GED)", 1,
                                       ifelse(EDUC == "Some college (including Associate Degree)", 2,
                                       ifelse(EDUC == "College graduate (Bachelor????Ts Degree)" |
                                              EDUC == "College graduate (Bachelorâ€™s Degree)", 3,
                                       ifelse(EDUC == "Graduate degree (or terminal degree)" |
                                              EDUC == "Some graduate work", 4, NA)))))
vf_survey$education <- factor(vf_survey$education)

# VBM vote 2020 (voter file): 1 = VBM; 0 = In-person
vf_survey$vbm20_vf <- with(vf_survey, ifelse(voted2020GE == "A" | voted2020GE == "B", 1, 0))

# VBM receive opinion: 1 = Excuse only; 0 = All/Request
vf_survey$vbm_receive_who <- with(vf_survey, ifelse(VBM_RECIEVE == "Only registered voters who have a valid excuse should receive one", 1,
                                             ifelse(VBM_RECIEVE == "Only registered voters who request a mail ballot should receive one" |
                                                    VBM_RECIEVE == "All registered voters should receive one", 0, NA)))

# Check writing frequency: 1 = Never ---> 5 = Within the last week
vf_survey$write_check <- with(vf_survey, ifelse(CHECK == "Never", 1,
                                         ifelse(CHECK == "More than a  year", 2,
                                         ifelse(CHECK == "More than a month", 3,
                                         ifelse(CHECK == "More than a week", 4,
                                         ifelse(CHECK == "Within the last week", 5, NA))))))

# Disability type: 0 = Other/None; 1 = Vision loss; 2 = Arthritis
vf_survey$disability_type <- with(vf_survey, ifelse(grepl("Arthritis \\(in your hands\\)", DISABILITY), 2,
                                             ifelse(grepl("Vision loss", DISABILITY), 1, 0)))
vf_survey$disability_type <- factor(vf_survey$disability_type)

# Other language at home: 1 = Yes; 0 = No
vf_survey$langugae_other_english <- with(vf_survey, ifelse(HOME_LANGUAGE == "No", 0,
                                                    ifelse(HOME_LANGUAGE == "Yes", 1, NA)))

##====================================##
## Keep only analysis variables       ##
##====================================##

analysis_vars <- c("signature_match", "vote_choice", "party_reg", "ideo7", "white",
                   "age_cat", "female", "education", "vbm20_vf", "vbm_receive_who",
                   "write_check", "disability_type", "langugae_other_english", "CountyCode")

vf_survey <- vf_survey[, ..analysis_vars]

##==============================##
## Descriptive Plot: All Sample ##
##==============================##

prop_data <- vf_survey %>%
  filter(!is.na(signature_match)) %>%
  mutate(signature_match = factor(signature_match,
                                  levels = c(0, 1),
                                  labels = c("Match", "No Match"))) %>%
  count(signature_match) %>%
  mutate(
    prop = n / sum(n),
    z = qnorm(0.975),
    total = sum(n),
    ci_lower = (prop + z^2/(2*total) - z*sqrt(prop*(1-prop)/total + z^2/(4*total^2))) / (1 + z^2/total),
    ci_upper = (prop + z^2/(2*total) + z*sqrt(prop*(1-prop)/total + z^2/(4*total^2))) / (1 + z^2/total)
  )

p1 <- ggplot(prop_data, aes(x = signature_match, y = prop, fill = signature_match)) +
  geom_col(alpha = 0.8, color = "white", size = 0.5) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                width = 0.2, size = 0.8, color = "black") +
  geom_text(aes(label = paste0(round(prop * 100, 1), "%")),
            vjust = -1.5, size = 4, fontface = "bold") +
  scale_fill_manual(name = "", values = c("Match" = "gray65", "No Match" = "gray25")) +
  scale_y_continuous(labels = scales::percent_format(),
                     breaks = seq(0, 0.7, 0.1),
                     limits = c(0, 0.7),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(x = "Signature Match Status", y = "Percent") +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 11),
    legend.position = "bottom",
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = NA),
    plot.background = element_rect(fill = "white", color = NA)
  )

print(p1)



# P-value for overall distribution
counts <- vf_survey %>%
  filter(!is.na(signature_match)) %>%
  count(signature_match)
x <- counts$n
test_result <- prop.test(x = x, n = rep(sum(x), 2))
cat("P-value for overall distribution:", test_result$p.value, "\n")

##====================================##
## Descriptive Plot: By Trump Support ##
##====================================##

prop_data <- vf_survey %>%
  filter(!is.na(signature_match), !is.na(vote_choice)) %>%
  mutate(
    signature_match = factor(signature_match, levels = c(0, 1), labels = c("Match", "No Match")),
    vote_choice = factor(vote_choice, levels = c(0, 1), 
                         labels = c("Non-Trump supporter 2020", "Trump supporter 2020"))
  ) %>%
  count(vote_choice, signature_match) %>%
  group_by(vote_choice) %>%
  mutate(
    prop = n / sum(n),
    z = qnorm(0.975),
    total = sum(n),
    ci_lower = (prop + z^2/(2*total) - z*sqrt(prop*(1-prop)/total + z^2/(4*total^2))) / (1 + z^2/total),
    ci_upper = (prop + z^2/(2*total) + z*sqrt(prop*(1-prop)/total + z^2/(4*total^2))) / (1 + z^2/total)
  ) %>%
  ungroup()

p2 <- ggplot(prop_data, aes(x = signature_match, y = prop, fill = vote_choice)) +
  geom_col(position = position_dodge(width = 0.9), color = "white", alpha = 0.8, width = 0.8) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                position = position_dodge(width = 0.9),
                width = 0.2, size = 0.8, color = "black") +
  geom_text(aes(label = paste0(round(prop * 100, 1), "%")),
            position = position_dodge(width = 0.9),
            vjust = -1.5, size = 4, fontface = "bold") +
  scale_y_continuous(labels = percent_format(),
                     breaks = seq(0, 1, 0.10),
                     limits = c(0, max(prop_data$ci_upper) * 1.15),
                     expand = expansion(mult = c(0, 0.05))) +
  scale_fill_manual(name = "",
                    values = c("Non-Trump supporter 2020" = "gray65", "Trump supporter 2020" = "gray25")) +
  labs(x = "Signature Match Status", y = "Percent") +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12, face = "bold"),
    axis.text = element_text(size = 11),
    legend.position = "bottom",
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = NA),
    plot.background = element_rect(fill = "white", color = NA)
  )

print(p2)


##============================##
## Logistic Regression Models ##
##============================##

# Convert back to data.frame for glm
vf_survey <- data.frame(vf_survey)

# Model 1: Basic demographics
m1 <- glm(signature_match ~ vote_choice + party_reg + ideo7 + white + age_cat +
                            female + education + CountyCode, 
                            data = vf_survey, family = binomial)

# Model 2: Add VBM vote 2020
m2 <- glm(signature_match ~ vote_choice + party_reg + ideo7 + white + age_cat +
                            female + education + vbm20_vf + CountyCode,
                            data = vf_survey, family = binomial)

# Model 3: Add VBM policy attitude
m3 <- glm(signature_match ~ vote_choice + party_reg + ideo7 + white + age_cat +
                            female + education + vbm20_vf + vbm_receive_who + CountyCode, 
                            data = vf_survey, family = binomial)

# Model 4: Add behavioral factors
m4 <- glm(signature_match ~ vote_choice + party_reg + ideo7 + white + age_cat +
                            female + education + vbm20_vf + vbm_receive_who + write_check +
                            disability_type + langugae_other_english + CountyCode,
                            data = vf_survey, family = binomial)

##==================================##
## County-Clustered Standard Errors ##
##==================================##

get_county_clustered_se <- function(model) {
  county_cluster <- vf_survey$CountyCode[as.numeric(rownames(model$model))]
  vcov_cl <- vcovCL(model, cluster = county_cluster)
  se_cl <- sqrt(diag(vcov_cl))
  coefs <- coef(model)
  t_stats <- coefs / se_cl
  p_vals <- 2 * (1 - pnorm(abs(t_stats)))
  return(list(se = se_cl, pval = p_vals))
}

robust_m1 <- get_county_clustered_se(m1)
robust_m2 <- get_county_clustered_se(m2)
robust_m3 <- get_county_clustered_se(m3)
robust_m4 <- get_county_clustered_se(m4)

##==============##
## Model Output ##
##==============##

map <- list(
  "vote_choice" = "Trump supporter 2020",
  "party_reg2" = "Republican",
  "party_reg3" = "Other party",
  "ideo7" = "Ideology (liberal to conservative)",
  "white" = "White",
  "age_cat2" = "Age 25-34",
  "age_cat3" = "Age 35-44",
  "age_cat4" = "Age 45-54",
  "age_cat5" = "Age 55-64",
  "age_cat6" = "Age 65+",
  "female" = "Female",
  "education2" = "Some college",
  "education3" = "College degree",
  "education4" = "Graduate degree",
  "vbm20_vf" = "VBM vote 2020",
  "vbm_receive_who" = "VBM excuse only",
  "write_check" = "Writes a check often",
  "disability_type1" = "Arthritis",
  "disability_type2" = "Vision loss",
  "langugae_other_english" = "Other language home",
  "(Intercept)" = "(Intercept)"
)

texreg(list(m1, m2, m3, m4),
       caption = "Political and Demographic Predictors of Signature Non-Match Perceptions",
       caption.above = TRUE,
       custom.coef.map = map,
       include.nobs = TRUE,
       custom.note = "%stars. County-clustered standard errors in parentheses.",
       dcolumn = TRUE,
       omit.coef = "CountyCode",
       custom.model.names = c("(1)", "(2)", "(3)", "(4)"),
       digits = 3,
       single.row = FALSE,
       override.se = list(robust_m1$se, robust_m2$se, robust_m3$se, robust_m4$se),
       override.pval = list(robust_m1$pval, robust_m2$pval, robust_m3$pval, robust_m4$pval))

##======================================##
## Average Marginal Effects for Model 4 ##
##======================================##

ame_results <- margins(m4, variables = c("vote_choice", "ideo7", "white", "age_cat", 
                                         "education", "vbm_receive_who"))
ame_summary <- summary(ame_results)

plot_data <- ame_summary %>%
  mutate(
    var_label = case_when(
      factor == "vote_choice" ~ "Trump supporter 2020",
      factor == "ideo7" ~ "Ideology (7-point)",
      factor == "white" ~ "White",
      factor == "age_cat2" ~ "Age 25-34",
      factor == "age_cat3" ~ "Age 35-44",
      factor == "age_cat4" ~ "Age 45-54",
      factor == "age_cat5" ~ "Age 55-64",
      factor == "age_cat6" ~ "Age 65+",
      factor == "education3" ~ "College degree",
      factor == "education4" ~ "Graduate degree",
      factor == "vbm_receive_who" ~ "VBM excuse only",
      TRUE ~ NA_character_
    ),
    var_label = factor(var_label, levels = rev(c(
      "Trump supporter 2020", "Ideology (7-point)", "White",
      "Age 25-34", "Age 35-44", "Age 45-54", "Age 55-64", "Age 65+",
      "College degree", "Graduate degree", "VBM excuse only"
    )))
  ) %>%
  filter(!is.na(AME) & !is.na(var_label))

p3 <- ggplot(plot_data, aes(x = var_label, y = AME)) +
  geom_point(size = 3, color = "black") +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red", alpha = 0.7) +
  scale_y_continuous(limits = c(-0.4, 0.4), breaks = seq(-0.4, 0.4, 0.1)) +
  coord_flip() +
  labs(x = "", y = "Average Marginal Effect") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 11),
    panel.grid.minor = element_blank()
  )

print(p3)


##========================##
## Descriptive Statistics ##
##========================##

vf_survey <- dummy_cols(vf_survey,
                        select_columns = c("party_reg", "age_cat", "education", "disability_type"),
                        remove_first_dummy = FALSE,
                        remove_selected_columns = FALSE)

vars <- c("vote_choice", "party_reg_1", "party_reg_2", "party_reg_3", "ideo7", "white",
          "age_cat_1", "age_cat_2", "age_cat_3", "age_cat_4", "age_cat_5", "age_cat_6",
          "education_1", "education_2", "education_3", "education_4",
          "disability_type_0", "disability_type_1", "disability_type_2",
          "female", "vbm20_vf", "vbm_receive_who", "write_check", "langugae_other_english")

result <- vf_survey %>%
  select(all_of(vars)) %>%
  mutate(across(everything(), as.numeric)) %>%
  summarise(across(everything(), list(
    N = ~sum(!is.na(.)),
    mean = ~round(mean(., na.rm = TRUE), 3),
    SD = ~round(sd(., na.rm = TRUE), 3),
    min = ~min(., na.rm = TRUE),
    max = ~max(., na.rm = TRUE)
  ), .names = "{.col}_{.fn}")) %>%
  pivot_longer(everything(), names_to = "var_stat", values_to = "value") %>%
  separate(var_stat, into = c("variable", "statistic"), sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = statistic, values_from = value)

print(result, n = Inf)